Treball Final de Master
20241 - Màster universitari en Ciència de dades (Data science)
Estudis d'Informàtica, Multimedia i Telecomunicacions
Arnau Gusart Verdú
Analysis and prediction of soil humidity in small holding vineyards.¶
This notebook contains the code and some light explanations for the TFM project.
# Libraries Used
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import itertools
import re
import seaborn as sns
import scipy.stats as stats
from scipy.stats import anderson
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline
# Load the data: mount Google Drive inside the Colab runtime
from google.colab import drive
import os
drive.mount('/content/drive')
# The listing result is discarded; presumably a quick check that the Drive root is reachable
os.listdir('/content/drive/My Drive/')
# Make the project folder the working directory for the rest of the notebook
os.chdir('/content/drive/MyDrive/Colab Notebooks/TFM/')
Mounted at /content/drive
1.- Creation of the dataset¶
Creation of datasets for each sensor using the distinct .csv files.
# Path to the folder containing the per-sensor CSV files
folder_path = '/content/drive/MyDrive/Colab Notebooks/TFM/data/'

# The four nodes of interest; files of any other sensor are ignored
NODES_OF_INTEREST = {
    '46005a000351353337353037',
    '270043001951343334363036',
    '380033001951343334363036',
    '4e0031000251353337353037',
}

# File names are expected to look like "<sensor code>_<column name>.csv"
FILE_NAME_PATTERN = re.compile(r"(.+?)_(.+)\.csv")

# sensor code -> list of per-file DataFrames (replaced further down by one merged DataFrame)
sensor_dataframes = {}

# Loop through all files
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):  # only CSV files
        continue

    match = FILE_NAME_PATTERN.match(file_name)
    if not match:
        print(f"Skipping file {file_name} due to invalid file name format.")
        continue

    sensor_code, column_name = match.group(1), match.group(2)
    if sensor_code not in NODES_OF_INTEREST:
        continue

    file_path = os.path.join(folder_path, file_name)
    if column_name == 'wind_direction_raw':
        # Wind direction files have a different format (four columns)
        temp_df = pd.read_csv(file_path, names=['time', 'sensor_code', 'direction', 'range'])
        # .copy() turns the column selection into an independent frame, so the
        # 'time' assignment below no longer raises SettingWithCopyWarning
        temp_df = temp_df[['time', 'direction', 'range']].copy()
    else:
        # Default format: timestamp plus one measurement column
        temp_df = pd.read_csv(file_path, names=['time', column_name])

    # Unparseable timestamps become NaT instead of raising
    temp_df['time'] = pd.to_datetime(temp_df['time'], errors='coerce')
    sensor_dataframes.setdefault(sensor_code, []).append(temp_df)

# Combine the per-file frames of every sensor into a single frame.
# NOTE(review): pd.concat(axis=1) aligns rows on the positional index, not on the
# timestamp, and 'time' is taken from whichever file os.listdir returned first for
# the sensor - confirm that all files of a sensor share the same row order.
for sensor_code, dfs in sensor_dataframes.items():
    combined = pd.concat(dfs, axis=1)  # built once instead of twice
    merged_df = combined.loc[:, ~combined.columns.duplicated()]
    merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('^time')].copy()
    merged_df['time'] = dfs[0]['time']
    sensor_dataframes[sensor_code] = merged_df

for sensor_code, df in sensor_dataframes.items():
    print(f"Data for sensor {sensor_code}:")
    print(df.head(), "\n")
<ipython-input-8-2305fb0d75d8>:39: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy temp_df['time'] = pd.to_datetime(temp_df['time'], errors='coerce')
Data for sensor 46005a000351353337353037:
soil_humidity_raw humidity_raw atmospheric_pressure_raw \
0 3883.0 54.516357 984.222473
1 304.0 54.302734 984.257507
2 3117.0 57.072205 984.377502
3 3219.0 58.575195 0.000000
4 3285.0 69.553894 983.794983
air_temperature_raw precipitation_raw battery_raw direction range \
0 31.378862 0.0 86.203125 1.535022e+09 -1.0
1 31.078560 0.0 85.769531 1.535022e+09 -1.0
2 30.102575 0.0 82.703125 1.535023e+09 -1.0
3 29.512695 0.0 83.031250 1.535024e+09 -1.0
4 28.654688 0.0 82.703125 1.535030e+09 0.0
wind_speed_raw time
0 0.00000 2018-08-23 13:07:11.783
1 0.00000 2018-08-23 13:17:37.242
2 0.00000 2018-08-23 13:37:40.126
3 2.41206 2018-08-23 15:05:16.805
4 0.00000 2018-08-23 15:15:39.859
Data for sensor 380033001951343334363036:
air_temperature_raw humidity_raw soil_humidity_raw precipitation_raw \
0 21.286545 40.318054 1300.0 0.0
1 21.329445 41.080994 1313.0 0.0
2 20.878992 41.554016 1297.0 0.0
3 19.945908 46.047729 1238.0 0.0
4 19.699230 45.666260 1265.0 0.0
battery_raw atmospheric_pressure_raw direction range wind_speed_raw \
0 91.488281 977.619995 1.523645e+09 -1.0 0.0
1 77.484375 0.000000 1.523645e+09 -1.0 0.0
2 81.726563 977.812500 1.523645e+09 -1.0 0.0
3 82.542969 978.000000 1.523646e+09 -1.0 0.0
4 81.890625 978.292480 1.523646e+09 -1.0 0.0
time
0 2018-04-13 20:36:33.568
1 2018-04-13 20:38:05.143
2 2018-04-13 20:48:29.220
3 2018-04-13 20:58:28.707
4 2018-04-13 21:08:29.311
Data for sensor 270043001951343334363036:
humidity_raw atmospheric_pressure_raw precipitation_raw battery_raw \
0 38.952393 0.0 0.0 100.191406
1 35.084290 0.0 0.0 91.488281
2 37.579102 0.0 0.0 91.488281
3 39.272827 0.0 0.0 91.488281
4 58.025879 0.0 0.0 92.437500
air_temperature_raw direction range soil_humidity_raw \
0 25.941238 1.525081e+09 -1.0 1290.0
1 22.509207 1.525082e+09 -1.0 1420.0
2 21.329445 1.525373e+09 7.0 1326.0
3 20.578690 1.525373e+09 7.0 1209.0
4 22.712984 1.525434e+09 1.0 1341.0
wind_speed_raw time
0 0.0 2018-04-05 20:55:43.617
1 0.0 2018-04-05 20:56:56.027
2 0.0 2018-04-05 20:58:34.785
3 0.0 2018-04-05 21:08:04.830
4 0.0 2018-04-05 21:15:01.636
Data for sensor 4e0031000251353337353037:
direction range soil_humidity_raw wind_speed_raw battery_raw \
0 1.525954e+09 -1.0 903.0 0.0 87.925781
1 1.525955e+09 3.0 801.0 0.0 87.925781
2 1.525955e+09 -1.0 1239.0 0.0 87.492188
3 1.525956e+09 3.0 3264.0 0.0 91.488281
4 1.525956e+09 4.0 2.0 0.0 91.648438
atmospheric_pressure_raw humidity_raw air_temperature_raw \
0 -9.990000 -5.809265 -46.581871
1 -9.990000 -5.809265 -46.581871
2 -9.990000 -5.809265 -46.581871
3 0.000000 -5.809265 -46.581871
4 1006.349976 56.927246 28.686863
precipitation_raw time
0 0.0 2018-05-10 14:10:54.707
1 0.0 2018-05-10 14:18:17.130
2 0.0 2018-05-10 14:28:40.367
3 0.0 2018-05-10 14:38:40.287
4 0.0 2018-05-10 14:41:00.514
# Report, sensor by sensor, how many values are missing in each column
for sensor_code, df in sensor_dataframes.items():
    row_total = len(df.index)
    missing_per_column = df.isnull().sum()
    print(f"Null values in DataFrame for sensor {sensor_code} ({row_total} rows):")
    print(missing_per_column)
    print("\n")
Null values in DataFrame for sensor 46005a000351353337353037 (5828 rows): soil_humidity_raw 29 humidity_raw 4 atmospheric_pressure_raw 20 air_temperature_raw 0 precipitation_raw 8 battery_raw 35 direction 11 range 11 wind_speed_raw 16 time 29 dtype: int64 Null values in DataFrame for sensor 380033001951343334363036 (19906 rows): air_temperature_raw 0 humidity_raw 51 soil_humidity_raw 426 precipitation_raw 96 battery_raw 538 atmospheric_pressure_raw 285 direction 150 range 150 wind_speed_raw 200 time 0 dtype: int64 Null values in DataFrame for sensor 270043001951343334363036 (20214 rows): humidity_raw 34 atmospheric_pressure_raw 291 precipitation_raw 69 battery_raw 571 air_temperature_raw 0 direction 295 range 295 soil_humidity_raw 476 wind_speed_raw 194 time 34 dtype: int64 Null values in DataFrame for sensor 4e0031000251353337353037 (21789 rows): direction 118 range 118 soil_humidity_raw 309 wind_speed_raw 166 battery_raw 366 atmospheric_pressure_raw 214 humidity_raw 46 air_temperature_raw 0 precipitation_raw 82 time 118 dtype: int64
Extraction of null counts for a table.
# One row per sensor: its code, its row count and the null count of every column
table_data = []
for sensor_code, df in sensor_dataframes.items():
    null_counts = df.isnull().sum()
    row = {"Sensor Code": sensor_code, "Row Count": len(df)}
    row.update(null_counts.items())
    table_data.append(row)

table_df = pd.DataFrame(table_data)
# Sensors become columns and the counted variables become rows
transposed_table_df = table_df.set_index("Sensor Code").T

# Export the transposed table to LaTeX
null_table_path = "/content/drive/MyDrive/Colab Notebooks/TFM/tables/null_table.tex"
with open(null_table_path, "w") as f:
    f.write(
        transposed_table_df.to_latex(
            index=True,
            header=True,
            caption="Transposed Table of Sensor Null Values",
        )
    )
Vertical join of the four distinct sensor dfs into one.
# Tag every per-sensor frame with its sensor code (the frames stored in
# sensor_dataframes are modified in place), then collect them for stacking
for sensor_code, df in sensor_dataframes.items():
    df['sensor_code'] = sensor_code
vertical_concat_dfs = list(sensor_dataframes.values())

# Stack the frames vertically and move 'time' and 'sensor_code' to the front
merged_df = pd.concat(vertical_concat_dfs, axis=0)
leading_columns = ['time', 'sensor_code']
cols = leading_columns + [col for col in merged_df.columns if col not in leading_columns]
merged_df = merged_df[cols]

# Discard every row with a null value and renumber the rows from zero
merged_df = merged_df.dropna().reset_index(drop=True)

# Short labels used instead of the long sensor codes
sensor_code_map = {
    '380033001951343334363036': 'A',
    '270043001951343334363036': 'B',
    '4e0031000251353337353037': 'C',
    '46005a000351353337353037': 'D'
}
merged_df['sensor_code'] = merged_df['sensor_code'].replace(sensor_code_map)

print("Merged DataFrame:")
print(merged_df.head())
print(len(merged_df.index))
Merged DataFrame:
time sensor_code soil_humidity_raw humidity_raw \
0 2018-08-23 13:07:11.783 D 3883.0 54.516357
1 2018-08-23 13:17:37.242 D 304.0 54.302734
2 2018-08-23 13:37:40.126 D 3117.0 57.072205
3 2018-08-23 15:05:16.805 D 3219.0 58.575195
4 2018-08-23 15:15:39.859 D 3285.0 69.553894
atmospheric_pressure_raw air_temperature_raw precipitation_raw \
0 984.222473 31.378862 0.0
1 984.257507 31.078560 0.0
2 984.377502 30.102575 0.0
3 0.000000 29.512695 0.0
4 983.794983 28.654688 0.0
battery_raw direction range wind_speed_raw
0 86.203125 1.535022e+09 -1.0 0.00000
1 85.769531 1.535022e+09 -1.0 0.00000
2 82.703125 1.535023e+09 -1.0 0.00000
3 83.031250 1.535024e+09 -1.0 2.41206
4 82.703125 1.535030e+09 0.0 0.00000
66227
2.- Data Analysis¶
First of all, we remove redundant columns, nulls and undefined measurements.
# A 'range' of -1 marks an undefined measurement: count those rows, then drop them
undefined_range = merged_df['range'] == -1
count_range_minus_1 = int(undefined_range.sum())
print(f"Number of rows with range = -1: {count_range_minus_1}")

merged_df = merged_df[~undefined_range]
print(merged_df.head())
Number of rows with range = -1: 6392
time sensor_code soil_humidity_raw humidity_raw \
4 2018-08-23 15:15:39.859 D 3285.0 69.553894
5 2018-08-23 15:25:39.571 D 3301.0 62.313599
6 2018-08-23 15:35:40.761 D 3288.0 60.963196
7 2018-08-23 15:45:40.334 D 3298.0 57.766479
8 2018-08-23 15:55:39.974 D 3362.0 58.743042
atmospheric_pressure_raw air_temperature_raw precipitation_raw \
4 983.794983 28.654688 0.0000
5 983.719971 30.939133 3.0734
6 983.729980 30.585205 0.0000
7 986.539978 28.761938 0.0000
8 985.567505 29.941700 1.9558
battery_raw direction range wind_speed_raw
4 82.703125 1.535030e+09 0.0 0.000000
5 82.542969 1.535030e+09 2.0 0.000000
6 82.050781 1.535031e+09 3.0 0.000000
7 82.378906 1.535031e+09 3.0 0.000000
8 83.355469 1.535032e+09 3.0 4.824121
# 'direction' is redundant; drop it if it is (still) present
merged_df = merged_df.drop(columns=['direction'], errors='ignore')

# Summary statistics with one row per variable, for the report table
summary = merged_df.describe()
transposed_summary = summary.transpose()

# Export the summary table to a LaTeX file
summary_table_path = "/content/drive/MyDrive/Colab Notebooks/TFM/tables/merged_df_description.tex"
with open(summary_table_path, "w") as f:
    f.write(
        transposed_summary.to_latex(
            index=True,
            caption="Summary Statistics of Merged DataFrame",
        )
    )
Different kinds of plots to observe patterns in data.
# Number of measurements recorded on each calendar day
df_copy = merged_df.assign(date=merged_df['time'].dt.date)
daily_recordings = df_copy.groupby('date').size()

plt.figure(figsize=(10, 6))
daily_recordings.plot()

# One x tick at the start of every month covered by the data
first_day = daily_recordings.index.min()
last_day = daily_recordings.index.max()
monthly_ticks = pd.date_range(start=first_day, end=last_day, freq='MS')
plt.xticks(monthly_ticks, rotation=45)

plt.title('Number of Recordings per Day')
plt.xlabel('Date')
plt.ylabel('Number of Recordings')
plt.grid(True)
plt.tight_layout()
plt.show()
Here we observe the number of measurements per day. We see an almost even distribution of around 400 measurements per day, starting in May.
# Number of measurements per wind direction.
# The position of each name in the list is the numeric 'range' code it stands for.
direction_names = ['North', 'NE', 'East', 'SE', 'South', 'SW', 'West', 'NW']
range_labels = dict(enumerate(direction_names))

# Ordered categorical so the bars keep the compass order
df_copy['range_label'] = pd.Categorical(
    df_copy['range'].map(range_labels),
    categories=direction_names,
    ordered=True,
)

plt.figure(figsize=(10, 6))
range_counts = df_copy['range_label'].value_counts().sort_index()
range_counts.plot(kind='bar', color='skyblue')
plt.title('Frequency of Different Range Values')
plt.xlabel('Range Direction')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
There is an almost even distribution of data in all categories, having some more measurements for SE and S and noticeably less in NW.
# Numeric columns to inspect ('sensor_code', 'time' and 'range' are left out)
excluded_columns = ['sensor_code', 'time', 'range']
numeric_columns = df_copy.select_dtypes(include=['float64', 'int64']).columns.difference(excluded_columns)

# Grid with three plots per row and as many rows as needed (ceiling division)
n_cols = 3
n_rows = -(-len(numeric_columns) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, numeric_columns):
    # Density histogram with a KDE line drawn on top
    ax.hist(df_copy[col], bins=30, color='skyblue', edgecolor='black', alpha=0.7, density=True)
    sns.kdeplot(df_copy[col], ax=ax, color='blue', linewidth=0.5, bw_adjust=0.5)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

# Hide the grid cells that received no plot
for unused_ax in axes[len(numeric_columns):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()
Rough examination to see if the data has normal distribution. We see that it does not appear to be the case.
# Anderson-Darling normality test for every numeric column (nulls dropped first)
anderson_results = {}
for col in numeric_columns:
    result = anderson(df_copy[col].dropna())
    anderson_results[col] = {
        'statistic': result.statistic,
        'critical_values': result.critical_values,
        'significance_level': result.significance_level,
    }

for col, result in anderson_results.items():
    statistic = result["statistic"]
    print(f'{col}: Anderson-Darling Statistic = {statistic:.4f}')
    for level, cv in zip(result["significance_level"], result["critical_values"]):
        print(f' {level}% critical value: {cv:.4f}')
    # Index 2 is the 5% critical value
    looks_normal = statistic < result["critical_values"][2]
    if looks_normal:
        print(f'{col} appears to be normally distributed.')
    else:
        print(f'{col} does not appear to be normally distributed.')
air_temperature_raw: Anderson-Darling Statistic = 655.1556 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 air_temperature_raw does not appear to be normally distributed. atmospheric_pressure_raw: Anderson-Darling Statistic = 9851.7832 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 atmospheric_pressure_raw does not appear to be normally distributed. battery_raw: Anderson-Darling Statistic = 1588.0518 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 battery_raw does not appear to be normally distributed. humidity_raw: Anderson-Darling Statistic = 463.4194 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 humidity_raw does not appear to be normally distributed. precipitation_raw: Anderson-Darling Statistic = 22565.9625 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 precipitation_raw does not appear to be normally distributed. soil_humidity_raw: Anderson-Darling Statistic = 2941.1726 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 soil_humidity_raw does not appear to be normally distributed. wind_speed_raw: Anderson-Darling Statistic = 3732.6588 15.0% critical value: 0.5760 10.0% critical value: 0.6560 5.0% critical value: 0.7870 2.5% critical value: 0.9180 1.0% critical value: 1.0920 wind_speed_raw does not appear to be normally distributed.
After applying the Anderson-Darling test (better fit than Shapiro-Wilk as the dataset is bigger), it confirms that none of the numerical columns follow a Normal distribution.
# QQ plots against the normal distribution for the columns of interest
columns_to_plot = ['battery_raw', 'humidity_raw', 'soil_humidity_raw', 'wind_speed_raw']
fig, axs = plt.subplots(2, 2, figsize=(12, 12))
axs = axs.flatten()

for qq_ax, col in zip(axs, columns_to_plot):
    stats.probplot(df_copy[col].dropna(), dist="norm", plot=qq_ax)
    qq_ax.set_title(f'QQ Plot for {col}')

plt.tight_layout()
plt.show()
We double-check with QQ plots and see that, at the extremes, the data differs greatly from the normal "expectation".
# Grid with three plots per row and as many rows as needed (ceiling division)
n_cols = 3
n_rows = -(-len(numeric_columns) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10))
axes = axes.flatten()

# One box plot per numeric variable
for ax, col in zip(axes, numeric_columns):
    sns.boxplot(data=df_copy[col], ax=ax)
    ax.set_title(col)

# Remove the grid cells that received no plot
for unused_ax in axes[len(numeric_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()
We notice that there are outliers with unrealistic values in atmospheric pressure so we remove the rows. (Setting the minimum to the lowest measured pressure in Spain).
# Remove rows with unrealistic atmospheric pressure (below 950).
# .copy() makes the filtered frames independent of the frames they were sliced
# from, so later column assignments on them do not raise SettingWithCopyWarning.
merged_df = merged_df[merged_df['atmospheric_pressure_raw'] >= 950].copy()
df_copy = df_copy[df_copy['atmospheric_pressure_raw'] >= 950].copy()

# Calculate the number of rows and columns for the grid
n_cols = 3
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10))
axes = axes.flatten()

# Create a box plot for each variable, now without the removed rows
for i, col in enumerate(numeric_columns):
    sns.boxplot(data=df_copy[col], ax=axes[i])
    axes[i].set_title(col)

# Remove the grid cells that received no plot
for i in range(len(numeric_columns), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()
We see that it also seems to have removed some temperature outliers.
# Fixed sensor order so every subplot shows the boxes in the same sequence
sensor_order = ['A', 'B', 'C', 'D']
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# when the column is written onto a filtered slice
df_copy = df_copy.assign(
    sensor_code=pd.Categorical(df_copy['sensor_code'], categories=sensor_order, ordered=True)
)

n_cols = 3
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10))
axes = axes.flatten()

# One colour per sensor
palette = sns.color_palette("Set2", len(sensor_order))

# Create a box plot for each numeric column, separated by 'sensor_code'
for i, col in enumerate(numeric_columns):
    # Passing `palette` without `hue` is deprecated in seaborn; mapping hue to the
    # x variable with legend=False gives the same picture without the FutureWarning
    sns.boxplot(
        data=df_copy,
        x='sensor_code',
        y=col,
        hue='sensor_code',
        palette=palette,
        legend=False,
        ax=axes[i],
    )
    axes[i].set_title(f'{col} by Sensor Code')

# Remove the grid cells that received no plot
for i in range(len(numeric_columns), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()
<ipython-input-13-7ba028a74ecb>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_copy['sensor_code'] = pd.Categorical(df_copy['sensor_code'], categories=sensor_order, ordered=True) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. 
sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette) <ipython-input-13-7ba028a74ecb>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_copy, x='sensor_code', y=col, ax=axes[i], palette=palette)
We repeat the boxplots, now differentiating by sensor, to check for different behaviours, such as sensor C for soil humidity and sensor A for atmospheric pressure. We also check for differences in the categorical variables.
# Plot the count of each range value, with one bar per sensor code
plt.figure(figsize=(10, 6))
sns.countplot(data=df_copy, x='range', hue='sensor_code', palette=palette)
plt.title('Range Counts by Sensor Code')
# The x axis holds the range values and the hue (legend) holds the sensor codes;
# the two labels were swapped before
plt.xlabel('Range')
plt.ylabel('Count')
plt.legend(title='Sensor Code', loc='upper right')
plt.tight_layout()
plt.show()
<ipython-input-140-13bfd7c1284c>:13: UserWarning: The palette list has more values (24) than needed (4), which may not be intended. sns.countplot(data=df_copy, x='range', hue='sensor_code', palette=palette)
# Numerical variables to correlate ('range' and 'sensor_code' are categorical).
# The list is defined here because the cell used `numerical_columns` before any
# cell had defined it, and called .corr() on it although it is a plain list.
numerical_columns = ['soil_humidity_raw', 'humidity_raw', 'atmospheric_pressure_raw',
                     'air_temperature_raw', 'precipitation_raw', 'battery_raw', 'wind_speed_raw']

# Calculate the correlation matrix on the selected DataFrame columns
correlation_matrix = merged_df[numerical_columns].corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix (Excluding Range and Sensor Code)')
plt.show()
# Numerical variables tested with a one-way ANOVA against each grouping factor
numerical_columns = ['soil_humidity_raw', 'humidity_raw', 'atmospheric_pressure_raw',
                     'air_temperature_raw', 'precipitation_raw', 'battery_raw', 'wind_speed_raw']

# Results keyed "<factor>_<variable>": first all 'sensor_code' tests, then all 'range' tests
anova_results = {}
for factor in ('sensor_code', 'range'):
    for column in numerical_columns:
        # One sample per distinct level of the factor
        groups = [merged_df[merged_df[factor] == level][column]
                  for level in merged_df[factor].unique()]
        f_stat, p_val = f_oneway(*groups)
        anova_results[f'{factor}_{column}'] = {'F-statistic': f_stat, 'p-value': p_val}

# Print the results
alpha = 0.05
print("ANOVA Results (alpha = 0.05):\n")
for test, result in anova_results.items():
    f_stat, p_val = result['F-statistic'], result['p-value']
    print(f"{test}:")
    print(f" F-statistic: {f_stat:.3f}")
    print(f" p-value: {p_val:.3f}")
    significant = p_val < alpha
    if significant:
        print(f" Conclusion: Statistically significant (p < {alpha})\n")
    else:
        print(f" Conclusion: Not statistically significant (p >= {alpha})\n")
ANOVA Results (alpha = 0.05): sensor_code_soil_humidity_raw: F-statistic: 26237.474 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_humidity_raw: F-statistic: 1631.196 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_atmospheric_pressure_raw: F-statistic: 48036.502 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_air_temperature_raw: F-statistic: 77.009 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_precipitation_raw: F-statistic: 17.430 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_battery_raw: F-statistic: 438.931 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) sensor_code_wind_speed_raw: F-statistic: 1185.008 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_soil_humidity_raw: F-statistic: 420.145 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_humidity_raw: F-statistic: 112.378 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_atmospheric_pressure_raw: F-statistic: 231.258 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_air_temperature_raw: F-statistic: 338.231 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_precipitation_raw: F-statistic: 2.309 p-value: 0.024 Conclusion: Statistically significant (p < 0.05) range_battery_raw: F-statistic: 17.182 p-value: 0.000 Conclusion: Statistically significant (p < 0.05) range_wind_speed_raw: F-statistic: 60.491 p-value: 0.000 Conclusion: Statistically significant (p < 0.05)
Both sensor code and range show statistically significant differences for every numerical variable, so we reject the null hypothesis of equal group means.
We take hourly averages to check for time-related behaviour; the different lines represent different days.
# Derive the date and hour on a separate frame. The original line called the
# undefined name `merged_pd`; pd.to_datetime is what was meant. Using assign()
# keeps the helper columns out of merged_df, which is later used for modelling.
hourly_source = merged_df.assign(time=pd.to_datetime(merged_df['time']))
hourly_source['date'] = hourly_source['time'].dt.date  # calendar date (yyyy-mm-dd)
hourly_source['hour'] = hourly_source['time'].dt.hour  # hour of the day (0-23)

# Calculate hourly averages; numeric_only=True skips the text column
# 'sensor_code', which cannot be averaged
hourly_avg = hourly_source.groupby(['date', 'hour']).mean(numeric_only=True).reset_index()
hourly_avg = hourly_avg[hourly_avg['atmospheric_pressure_raw'] >= 950]
hourly_avg = hourly_avg[hourly_avg['atmospheric_pressure_raw'] <= 1050]

# Color palette: one colour per group of ten consecutive days
bright_palette = sns.color_palette("Set2", n_colors=(len(hourly_avg['date'].unique()) // 10) + 1)
hourly_avg['date'] = pd.to_datetime(hourly_avg['date'])

# The day order is the same for every subplot, so compute it once
sorted_dates = sorted(hourly_avg['date'].unique())

plt.figure(figsize=(15, 12))
for i, column in enumerate(numeric_columns):
    if column == 'hour':
        continue
    plt.subplot(3, 3, i + 1)  # 3x3 grid
    # One line per day, coloured by its ten-day group
    for idx, date in enumerate(sorted_dates):
        group_index = idx // 10
        color = bright_palette[group_index % len(bright_palette)]
        daily_avg = hourly_avg[hourly_avg['date'] == date]
        plt.plot(daily_avg['hour'], daily_avg[column], color=color, alpha=0.7)
    plt.title(f'Average {column} over Time')
    plt.xlabel('Hour of the Day')
    plt.ylabel(column.capitalize())
    plt.xticks(np.arange(0, 24, step=1))  # Display hours from 0 to 23

plt.tight_layout()
plt.show()
Noticeable behaviours:
- Drop in humidity during the day.
- Increase in temperature during the day.
- Constant atmospheric pressure.
- Very erratic precipitation.
Plot of the two variables with the most (negative) correlation.
# Month of every measurement, used to colour the points
merged_df['month'] = merged_df['time'].dt.month

plt.figure(figsize=(12, 8))
scatter_options = dict(x='air_temperature_raw', y='humidity_raw', hue='month', palette='viridis')
sns.scatterplot(data=merged_df, **scatter_options)
plt.title('Air Temperature vs. Humidity colored by Month')
plt.xlabel('Air Temperature (raw)')
plt.ylabel('Humidity (raw)')
plt.show()
3.- Data Modeling¶
# Display summary statistics of merged_df before preparing the modelling data
merged_df.describe()
| time | soil_humidity_raw | humidity_raw | atmospheric_pressure_raw | air_temperature_raw | precipitation_raw | battery_raw | range | wind_speed_raw | |
|---|---|---|---|---|---|---|---|---|---|
| count | 59814 | 59814.000000 | 59814.000000 | 59814.000000 | 59814.000000 | 59814.000000 | 59814.000000 | 59814.000000 | 59814.000000 |
| mean | 2018-07-22 08:35:45.775874560 | 2676.293761 | 56.704038 | 986.223991 | 26.403537 | 0.006213 | 67.029714 | 3.414568 | 3.519540 |
| min | 2018-04-05 21:37:17.255000 | 0.000000 | 11.379761 | 966.104980 | 6.861289 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2018-06-15 17:08:38.429750016 | 1773.000000 | 40.325684 | 981.310624 | 20.246210 | 0.000000 | 59.476563 | 2.000000 | 0.000000 |
| 50% | 2018-07-23 14:11:53.329499904 | 3022.000000 | 57.079834 | 987.727478 | 24.890179 | 0.000000 | 69.625000 | 3.000000 | 2.412060 |
| 75% | 2018-08-28 23:15:11.474749952 | 3623.000000 | 72.857422 | 990.619995 | 33.116329 | 0.000000 | 79.933594 | 5.000000 | 4.824121 |
| max | 2018-10-09 05:55:17.980000 | 4095.000000 | 103.146118 | 1008.294983 | 46.694302 | 16.205196 | 98.925781 | 7.000000 | 43.417084 |
| std | NaN | 1025.463879 | 19.294970 | 6.153630 | 7.950200 | 0.138051 | 15.993068 | 2.125251 | 4.180872 |
We prepare the data for the training and testing of models.
# Set the objective variable as 'soil_humidity_raw'
obj_variable = 'soil_humidity_raw'

# Extract month and hour from the 'time' variable
merged_df['month'] = merged_df['time'].dt.month
merged_df['hour'] = merged_df['time'].dt.hour

# Separate features (X) and target (y). A 'date' helper column may have been added
# to merged_df by the hourly-average analysis; it is not a feature, so drop it if present.
X = merged_df.drop(columns=['time', obj_variable]).drop(columns=['date'], errors='ignore')
y = merged_df[obj_variable]

# Columns treated as categories; they are excluded from scaling
categorical_columns = ['month', 'hour', 'range', 'sensor_code']
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
numeric_columns = [col for col in numeric_columns if col not in categorical_columns]

# Use one scaler per role. Re-fitting the feature scaler on the target overwrote
# the feature statistics and left no way to undo the target scaling.
# NOTE(review): both scalers are fitted on the full dataset before the split, so
# test-set statistics influence the scaling; consider fitting on the training part only.
scaler = RobustScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
target_scaler = RobustScaler()
y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1))  # Reshape y to be a 2D array

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)
y_train = pd.DataFrame({'soil_humidity_raw': np.ravel(y_train)})
y_test = pd.DataFrame({'soil_humidity_raw': np.ravel(y_test)})

# Save the processed data as CSV
X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/train/X_train.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/train/X_test.csv', index=False)
y_train.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/train/y_train.csv', index=False, header=False)
y_test.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/train/y_test.csv', index=False, header=False)

print(X_train.head())
print(y_train)
print(f"X_train size: {X_train.shape}")
print(f"y_train size: {y_train.shape}")
sensor_code humidity_raw atmospheric_pressure_raw \
3417 D 0.799250 0.362269
29807 B -0.054878 -0.039476
62968 C 0.828565 0.546226
36332 B 0.715056 0.258883
33011 B -0.202158 -0.282243
air_temperature_raw precipitation_raw battery_raw range \
3417 -0.095000 0.0 0.487875 7.0
29807 -0.519167 0.0 0.152377 6.0
62968 0.122500 0.0 0.551652 6.0
36332 -0.192500 0.0 0.511743 4.0
33011 0.321667 0.0 0.623449 4.0
wind_speed_raw month hour
3417 0.0 9 11
29807 0.0 6 7
62968 0.0 9 6
36332 -0.5 7 6
33011 1.0 6 19
soil_humidity_raw
0 -0.736216
1 -0.641081
2 0.316216
3 -1.064865
4 -0.612432
... ...
47846 0.369730
47847 -0.502162
47848 0.069189
47849 0.148649
47850 0.445405
[47851 rows x 1 columns]
X_train size: (47851, 10)
y_train size: (47851, 1)
# Show the feature columns that ended up in the training set
X_train.columns
Index(['sensor_code', 'humidity_raw', 'atmospheric_pressure_raw',
'air_temperature_raw', 'precipitation_raw', 'battery_raw', 'range',
'wind_speed_raw', 'month', 'hour'],
dtype='object')
Function for plotting model results with different sample sizes.
def plot_learning_curves(pipeline, train_X, train_y, cv=3, scoring='r2'):
    """
    Plot the mean training and validation scores against the training-set size.

    Parameters:
    - pipeline: The machine learning pipeline or model to evaluate.
    - train_X: Features for the training set.
    - train_y: Target variable for the training set.
    - cv: Cross-validation setting forwarded to learning_curve (default is 3).
    - scoring: Scoring metric forwarded to learning_curve and used for the
      axis and legend labels (default is 'r2').
    """
    sizes, fold_train_scores, fold_valid_scores = learning_curve(
        pipeline, train_X, train_y, cv=cv, scoring=scoring
    )

    # One (legend label, per-fold score matrix, colour) entry per curve
    curves = (
        (f'Training {scoring}', fold_train_scores, 'tab:blue'),
        (f'Validation {scoring}', fold_valid_scores, 'tab:orange'),
    )

    plt.figure(figsize=(10, 6))
    for legend_label, fold_scores, colour in curves:
        # Average across the cross-validation folds for each training size
        plt.plot(sizes, fold_scores.mean(axis=1), label=legend_label, color=colour)

    plt.ylabel(scoring, fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title(f'Learning Curves for {str(pipeline)}', fontsize=10, y=1.03)
    plt.legend()
    plt.show()
Function for the definition of pipelines for preprocessing and model fitting.
def define_pipeline(categorical_pipeline='default',
                    binary_pipeline=None,
                    numerical_pipeline='default',
                    model=None,
                    numerical_features=None,
                    categorical_features=None,
                    binary_features=None):
    """
    Function to define a pipeline for preprocessing and model fitting.

    Parameters:
    - categorical_pipeline: The transformer for categorical data. The string
      'default' selects a one-hot encoder that ignores unknown categories.
    - binary_pipeline: The transformer for binary features (default is None).
      It is only added when binary_features is also given.
    - numerical_pipeline: The transformer for numerical data. The string
      'default' selects standard scaling.
    - model: The model to be used as the final pipeline step.
    - numerical_features: Column names sent to the numerical transformer
      (default: 'humidity_raw', 'wind_speed_raw', 'air_temperature_raw').
    - categorical_features: Column names sent to the categorical transformer
      (default: 'month', 'hour', 'range', 'sensor_code').
    - binary_features: Column names sent to the binary transformer
      (default is None, i.e. no binary branch).

    Returns:
    - The defined pipeline, with steps named 'preprocessor' and 'model'.
    """
    # Resolve the 'default' placeholders. The isinstance guard keeps the
    # comparison meaningful when a transformer object is passed instead.
    if isinstance(numerical_pipeline, str) and numerical_pipeline == 'default':
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())  # Standard scaling for numerical features
        ])
    if isinstance(categorical_pipeline, str) and categorical_pipeline == 'default':
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # OneHotEncoder for categorical features
        ])

    # The target ('soil_humidity_raw') must not be listed among the inputs:
    # it is not a column of the feature matrix, and using it would leak the
    # value being predicted.
    if numerical_features is None:
        numerical_features = ['humidity_raw', 'wind_speed_raw', 'air_temperature_raw']
    if categorical_features is None:
        categorical_features = ['month', 'hour', 'range', 'sensor_code']

    transformers = [
        ('num', numerical_pipeline, list(numerical_features)),
        ('cat', categorical_pipeline, list(categorical_features)),
    ]
    # The binary branch is optional and needs both a transformer and columns.
    if binary_pipeline is not None and binary_features:
        transformers.append(('bin', binary_pipeline, list(binary_features)))

    # Define column transformer
    preprocessor = ColumnTransformer(transformers=transformers)

    # Define pipeline with preprocessing and model
    clf = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    return clf
Function for hyperparameter optimization.
def optimize_hyperparameters(X_train, y_train, model=None, model_grid_values=None, scoring='r2', cv_splits=3):
    """
    Function to optimize hyperparameters for a model pipeline using GridSearchCV.

    Parameters:
    - X_train: The training feature matrix.
    - y_train: The target variable (as a vector, not as a DataFrame).
    - model: The model to be optimized.
    - model_grid_values: Hyperparameter grid for the model, with keys prefixed
      by 'model__' (default is None, meaning an empty grid).
    - scoring: The single scoring metric to optimize (default is 'r2').
    - cv_splits: Number of cross-validation splits (default is 3).

    Returns:
    - The best estimator (pipeline refitted with the optimized hyperparameters).
    """
    # Avoid a mutable default argument: build a fresh empty grid per call.
    param_grid = {} if model_grid_values is None else model_grid_values

    clf = define_pipeline(model=model)
    grid_search = GridSearchCV(clf, param_grid, cv=cv_splits, scoring=scoring, return_train_score=True)
    # Fit the model to the training data (X_train, y_train)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters found
    print("Best hyperparameters:")
    print(grid_search.best_params_)

    cv_results = pd.DataFrame(grid_search.cv_results_)
    print("\nAvailable metrics in grid search results:")
    print(cv_results.columns)

    # With a single scoring metric the result columns are named
    # 'mean_test_score' / 'mean_train_score' (not 'mean_test_<metric>'), and
    # the winning candidate is the row at best_index_, not the first row.
    if 'mean_test_score' in cv_results.columns:
        best_row = cv_results.iloc[grid_search.best_index_]
        print(f"\nScores for {scoring}:")
        print(f"{scoring}: {best_row['mean_test_score']:.2f}")
        if 'mean_train_score' in cv_results.columns:
            print(f"{scoring} (train): {best_row['mean_train_score']:.2f}")
    else:
        print(f"\nScoring metric '{scoring}' not found. Available metrics:")
        print(cv_results.columns)

    return grid_search.best_estimator_
3.1.- Regression Models¶
# Split the feature columns by name: columns whose name starts with one of the
# dummy-variable prefixes are treated as categorical, all others as numeric.
# NOTE(review): this presumably expects X_train to hold already-encoded columns
# such as 'month_6'; plain names like 'month' or 'sensor_code' do not match any
# prefix and would land in numeric_cols - confirm the expected input.
dummy_prefixes = ('month_', 'hour_', 'sensor_code_', 'range_')
is_dummy_column = [name.startswith(dummy_prefixes) for name in X_train.columns]
numeric_cols = [name for name, is_dummy in zip(X_train.columns, is_dummy_column) if not is_dummy]
categorical_cols = [name for name, is_dummy in zip(X_train.columns, is_dummy_column) if is_dummy]
# Function to evaluate regression models
def evaluate_regression_model(model, X_train, y_train, X_test, y_test):
    """
    Fit a regressor on the training split and score it on both splits.

    Parameters:
    - model: Estimator exposing fit and predict; it is fitted in place.
    - X_train, y_train: Training features and targets.
    - X_test, y_test: Test features and targets.

    Returns:
    - A dictionary with the RMSE, MAE and R^2 for the training and test splits.
    """
    def rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    model.fit(X_train, y_train)
    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    return {
        'Train RMSE': rmse(y_train, fitted_train),
        'Test RMSE': rmse(y_test, fitted_test),
        'Train MAE': mean_absolute_error(y_train, fitted_train),
        'Test MAE': mean_absolute_error(y_test, fitted_test),
        'Train R^2': r2_score(y_train, fitted_train),
        'Test R^2': r2_score(y_test, fitted_test),
    }
# 1. Standard Linear Regression
# NOTE(review): this scaler instance is not used by the pipeline below, which
# creates its own StandardScaler; the name is rebound later in this cell.
scaler = StandardScaler()
# Scale the numeric columns and one-hot encode the categorical ones
# (dropping the first level of each categorical column).
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_cols),
('cat', OneHotEncoder(drop='first'), categorical_cols)
])
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('regressor', LinearRegression())
])
pipeline_results = evaluate_regression_model(pipeline, X_train, y_train, X_test, y_test)
# 2. Ridge Regression
# Fitted directly on X_train, without the preprocessing pipeline used above.
ridge_model = Ridge(alpha=1.0)
ridge_results = evaluate_regression_model(ridge_model, X_train, y_train, X_test, y_test)
# 3. Lasso Regression
# Also fitted directly on X_train, without the preprocessing pipeline.
lasso_model = Lasso(alpha=0.1)
lasso_results = evaluate_regression_model(lasso_model, X_train, y_train, X_test, y_test)
# 4. Logarithmic transformation
# NOTE(review): log_transformer is not used in the visible code; the features are
# transformed with safe_log_transform instead - confirm whether it can be removed.
log_transformer = FunctionTransformer(np.log1p, validate=True) # Log transform (log(1+x) to avoid log(0))
def safe_log_transform(X):
    """
    Return log(1 + x) element-wise after flooring every value at 1e-6.

    Values below 1e-6 (zero and all negative values included) are replaced by
    1e-6 before the logarithm is taken, so they all map to the same output.
    """
    return np.log1p(np.maximum(X, 1e-6))
# Apply the floored log transform to the numeric feature columns of both splits.
# NOTE(review): the features were already scaled earlier in the notebook, so a
# large share of the values is presumably <= 0 and collapses onto the 1e-6 floor.
# That may explain the weak score of this variant - confirm.
X_train_log = safe_log_transform(X_train[numeric_cols])
X_test_log = safe_log_transform(X_test[numeric_cols])
# Standardise the transformed features: fit on the training split only and
# reuse the fitted statistics for the test split.
scaler = StandardScaler()
X_train_log_scaled = scaler.fit_transform(X_train_log)
X_test_log_scaled = scaler.transform(X_test_log)
# Plain linear regression on the log-transformed numeric features only.
log_model = LinearRegression()
log_results = evaluate_regression_model(log_model, X_train_log_scaled, y_train, X_test_log_scaled, y_test)
# 5. Results summary table
# Map each display label to the metrics dictionary of the corresponding model.
model_reports = {
    'Linear Regression (Full Pipeline)': pipeline_results,
    'Ridge Regression': ridge_results,
    'Lasso Regression': lasso_results,
    'Log Transformed (Scaled)': log_results,
}
metric_columns = ['Train RMSE', 'Test RMSE', 'Train MAE', 'Test MAE', 'Train R^2', 'Test R^2']
# One row per model: the label first, then the metrics in a fixed column order.
results = pd.DataFrame([
    {'Model': label, **{metric: report[metric] for metric in metric_columns}}
    for label, report in model_reports.items()
])
print(results)
Model Train RMSE Test RMSE Train MAE Test MAE Train R^2 Test R^2 0 Linear Regression (Full Pipeline) 0.341115 0.342258 0.252647 0.253646 0.620417 0.622196 1 Ridge Regression 0.341117 0.342261 0.252636 0.253633 0.620415 0.622189 2 Lasso Regression 0.447215 0.449843 0.384432 0.386756 0.347565 0.347349 3 Log Transformed (Scaled) 0.543703 0.547654 0.480762 0.484916 0.035665 0.032679
# Columns routed to the scaler and to the one-hot encoder respectively.
numerical_features = ['humidity_raw', 'wind_speed_raw', 'air_temperature_raw', 'atmospheric_pressure_raw',
'precipitation_raw', 'battery_raw']
categorical_features = ['month', 'hour', 'range', 'sensor_code']
numerical_transformer = StandardScaler()
# Categories not seen during fitting are ignored instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_features),
('cat', categorical_transformer, categorical_features)
]
)
# Define Linear Regression
# No intercept term is fitted.
# NOTE(review): presumably acceptable because every categorical level keeps its
# own dummy column, which can absorb a constant offset - confirm.
linear_model = LinearRegression(fit_intercept=False)
# Create the pipeline
# NOTE(review): despite the name, no hyperparameter search is run for this model.
optimized_model = Pipeline([
('preprocessor', preprocessor),
('model', linear_model)
])
# Plot learning curves for the optimized model
plot_learning_curves(optimized_model, X_train, y_train, scoring='r2')
# Fit on the full training split before predicting.
optimized_model.fit(X_train, y_train)
# Predict on the test set
y_pred_lr = optimized_model.predict(X_test)
# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
# Print metrics
print(f"Linear Regression Mean Squared Error: {mse_lr}")
print(f"Linear Regression R-squared: {r2_lr}")
# Plot actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_lr, color='purple', label='Predicted vs Actual (Linear Regression)')
# Diagonal reference line spanning the observed range: points on it are perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression: Actual vs Predicted Values')
plt.legend()
plt.show()
Linear Regression Mean Squared Error: 0.11714067650912165 Linear Regression R-squared: 0.6221958582210162
3.2.- SVM¶
# Flatten the targets to 1D arrays for the estimators and metrics used below.
# np.ravel accepts both DataFrames and arrays, so this cell can be re-run safely
# (calling .values on an already converted array would raise AttributeError).
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
# Define the SVR model
svm_model = SVR()
# Set a hyperparameter grid for SVR
# Keys carry the 'model__' prefix because the regressor is the 'model' step of the pipeline.
svm_grid_values = {
    'model__kernel': ['linear', 'poly', 'rbf'],  # Kernel functions
    'model__C': [0.1, 1, 10],
    'model__epsilon': [0.1, 0.2, 0.5],
    'model__gamma': ['scale', 'auto']
}
# Perform hyperparameter optimization
optimized_svm_model = optimize_hyperparameters(
    X_train,
    y_train,
    model=svm_model,
    model_grid_values=svm_grid_values,
    scoring='r2',
    cv_splits=3
)
Best hyperparameters:
{'model__C': 1, 'model__epsilon': 0.1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Available metrics in grid search results:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__C', 'param_model__epsilon', 'param_model__gamma',
'param_model__kernel', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'mean_test_score',
'std_test_score', 'rank_test_score', 'split0_train_score',
'split1_train_score', 'split2_train_score', 'mean_train_score',
'std_train_score'],
dtype='object')
Scoring metric 'r2' not found. Available metrics:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__C', 'param_model__epsilon', 'param_model__gamma',
'param_model__kernel', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'mean_test_score',
'std_test_score', 'rank_test_score', 'split0_train_score',
'split1_train_score', 'split2_train_score', 'mean_train_score',
'std_train_score'],
dtype='object')
# Create a new pipeline with the best hyperparameters
# The values are copied by hand from the grid-search output printed above.
best_svm_pipeline = define_pipeline(
model=SVR(C=1, epsilon=0.1, gamma='scale', kernel='rbf')
)
# Plot learning curves
plot_learning_curves(best_svm_pipeline, X_train, y_train, scoring='r2')
# Fit the pipeline on the training data
best_svm_pipeline.fit(X_train, y_train)
# Predict on the test set
y_pred_svm = best_svm_pipeline.predict(X_test)
# Evaluate
mse_svm = mean_squared_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)
print(f"SVR Mean Squared Error: {mse_svm}")
print(f"SVR R-squared: {r2_svm}")
# Plot actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_svm, color='green', label='Predicted vs Actual (SVM)')
# Diagonal reference line spanning the observed range: points on it are perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('SVM: Actual vs Predicted Values')
plt.legend()
plt.show()
SVR Mean Squared Error: 0.07126483462658342 SVR R-squared: 0.7701554192149386
3.3.- K-NN¶
# Define the K-NN model
knn_model = KNeighborsRegressor()
# Set a hyperparameter grid for K-NN
# The preprocessing step hands the model a sparse matrix, for which scikit-learn
# always falls back to brute-force neighbour search ("cannot use tree with sparse
# input: using brute force"). Searching over several `algorithm` values would
# therefore only repeat identical fits, so a single value is kept in the grid.
knn_grid_values = {
    'model__n_neighbors': [3, 5, 10, 15],
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto'],
    'model__p': [1, 2]
}
# Perform hyperparameter optimization
optimized_knn_model = optimize_hyperparameters(
    X_train,
    y_train,
    model=knn_model,
    model_grid_values=knn_grid_values,
    scoring='r2',
    cv_splits=3
)
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py:598: UserWarning: cannot use tree with sparse input: using brute force
warnings.warn("cannot use tree with sparse input: using brute force")
Best hyperparameters:
{'model__algorithm': 'auto', 'model__n_neighbors': 10, 'model__p': 1, 'model__weights': 'distance'}
Available metrics in grid search results:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__algorithm', 'param_model__n_neighbors', 'param_model__p',
'param_model__weights', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'mean_test_score',
'std_test_score', 'rank_test_score', 'split0_train_score',
'split1_train_score', 'split2_train_score', 'mean_train_score',
'std_train_score'],
dtype='object')
Scoring metric 'r2' not found. Available metrics:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__algorithm', 'param_model__n_neighbors', 'param_model__p',
'param_model__weights', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'mean_test_score',
'std_test_score', 'rank_test_score', 'split0_train_score',
'split1_train_score', 'split2_train_score', 'mean_train_score',
'std_train_score'],
dtype='object')
# Rebuild the KNN pipeline with the hyperparameters reported as best by the
# grid search above (algorithm='auto', n_neighbors=10, p=1, weights='distance').
best_knn_pipeline = define_pipeline(
    model=KNeighborsRegressor(
        algorithm='auto',
        n_neighbors=10,
        p=1,
        weights='distance',
    )
)

# Plot learning curves (R^2 scoring) computed on the training data
plot_learning_curves(best_knn_pipeline, X_train, y_train, scoring='r2')

# Fit explicitly on the full training set before predicting, as the Random
# Forest section does, so this cell does not depend on plot_learning_curves
# leaving the pipeline fitted as a side effect.
best_knn_pipeline.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = best_knn_pipeline.predict(X_test)

# Evaluate the predictions against the true test targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Plot actual vs predicted values; points on the dashed diagonal are
# perfect predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', label='Predicted vs Actual')
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='red',
    linestyle='--',
    label='Ideal Fit',
)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()
Mean Squared Error: 0.06807949571774566 R-squared: 0.7804288295160559
3.4.- Random Forest¶
# Base Random Forest estimator; the seed is fixed for reproducible runs
rf_model = RandomForestRegressor(random_state=42)

# Search space: two candidate values for each of the seven hyperparameters
# (128 combinations). The 'model__' prefix presumably targets the pipeline
# step named 'model' built inside optimize_hyperparameters.
rf_grid_values = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['sqrt', 'log2'],
    'model__bootstrap': [True, False],
    'model__criterion': ['squared_error', 'absolute_error'],
}

# Run the hyperparameter search on the training data
optimized_rf_model = optimize_hyperparameters(
    X_train,
    y_train,
    model=rf_model,
    model_grid_values=rf_grid_values,
)
Best hyperparameters:
{'model__bootstrap': False, 'model__criterion': 'squared_error', 'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Available metrics in grid search results:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__bootstrap', 'param_model__criterion',
'param_model__max_depth', 'param_model__max_features',
'param_model__min_samples_leaf', 'param_model__min_samples_split',
'param_model__n_estimators', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'split3_test_score',
'split4_test_score', 'mean_test_score', 'std_test_score',
'rank_test_score', 'split0_train_score', 'split1_train_score',
'split2_train_score', 'split3_train_score', 'split4_train_score',
'mean_train_score', 'std_train_score'],
dtype='object')
Scoring metric 'neg_mean_squared_error' not found. Available metrics:
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_model__bootstrap', 'param_model__criterion',
'param_model__max_depth', 'param_model__max_features',
'param_model__min_samples_leaf', 'param_model__min_samples_split',
'param_model__n_estimators', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'split3_test_score',
'split4_test_score', 'mean_test_score', 'std_test_score',
'rank_test_score', 'split0_train_score', 'split1_train_score',
'split2_train_score', 'split3_train_score', 'split4_train_score',
'mean_train_score', 'std_train_score'],
dtype='object')
# Column-wise preprocessing: numerical and categorical features each go
# through their own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full pipeline: preprocessing followed by a Random Forest configured with
# the best hyperparameters reported by the grid search
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=100,
        criterion='squared_error',
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=False,
        random_state=42,
    )),
])

# Learning curve (R^2 scoring) on the training data
plot_learning_curves(pipeline, X_train, y_train, scoring='r2')

# Train on the full training set, then predict the held-out test set
pipeline.fit(X_train, y_train)
y_pred_rf = pipeline.predict(X_test)

# Test-set metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R-squared: {r2_rf}")

# Actual vs predicted scatter; the dashed diagonal marks a perfect fit
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    y_test,
    y_pred_rf,
    color='orange',
    label='Predicted vs Actual (Random Forest)',
)
ax.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='red',
    linestyle='--',
    label='Ideal Fit',
)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Random Forest: Actual vs Predicted Values')
ax.legend()
plt.show()
Random Forest Mean Squared Error: 0.04234545365710133 Random Forest R-squared: 0.8634267083482557
Since Random Forest gives the best results of the models tested, we also check which variables are the most important in the model.
# Take both fitted steps from the pipeline itself, so the feature names are
# guaranteed to come from the same fitted objects as the importances.
rf_model = pipeline.named_steps['model']
fitted_preprocessor = pipeline.named_steps['preprocessor']

# Importance score of each transformed feature, in the model's input order
importances = rf_model.feature_importances_

# Feature names in ColumnTransformer output order: the 'num' block first,
# then the expanded 'cat' block. list(...) keeps the concatenation below
# valid even if numerical_features is not a plain list.
# NOTE(review): assumes the numerical transformer emits exactly one output
# column per input column -- confirm against its definition.
numerical_feature_names = list(numerical_features)
# Look the categorical transformer up by name instead of by position, so the
# lookup does not break if the transformer order changes.
categorical_feature_names = list(
    fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
)

# Combine the feature names
all_feature_names = numerical_feature_names + categorical_feature_names

# Fail with a clear message instead of building a mislabelled table
if len(all_feature_names) != len(importances):
    raise ValueError(
        f"Got {len(all_feature_names)} feature names for "
        f"{len(importances)} importances"
    )

# One row per transformed feature with its importance score
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': importances,
})

# Sort from most to least important
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
Feature Importance 47 sensor_code_C 0.276289 46 sensor_code_B 0.240833 3 atmospheric_pressure_raw 0.087209 0 humidity_raw 0.070584 5 battery_raw 0.051748 2 air_temperature_raw 0.047827 45 sensor_code_A 0.039832 48 sensor_code_D 0.021752 11 month_9 0.019119 1 wind_speed_raw 0.017376 9 month_7 0.016154 7 month_5 0.014279 10 month_8 0.012435 8 month_6 0.008605 43 range_6.0 0.006065 42 range_5.0 0.005622 12 month_10 0.005092 44 range_7.0 0.003708 40 range_3.0 0.003603 38 range_1.0 0.003075 37 range_0.0 0.002417 39 range_2.0 0.002408 4 precipitation_raw 0.002230 22 hour_9 0.002170 6 month_4 0.002132 41 range_4.0 0.002100 23 hour_10 0.001911 21 hour_8 0.001910 24 hour_11 0.001858 26 hour_13 0.001838 27 hour_14 0.001761 31 hour_18 0.001687 25 hour_12 0.001678 28 hour_15 0.001598 13 hour_0 0.001597 30 hour_17 0.001573 34 hour_21 0.001534 20 hour_7 0.001517 29 hour_16 0.001507 32 hour_19 0.001500 33 hour_20 0.001481 35 hour_22 0.001472 19 hour_6 0.001438 18 hour_5 0.001410 36 hour_23 0.001341 16 hour_3 0.001250 14 hour_1 0.001181 17 hour_4 0.001157 15 hour_2 0.001136
# Horizontal bar chart of every feature importance, one gradient colour per bar.
# Passing `palette` without `hue` is deprecated in seaborn (FutureWarning,
# removal announced for v0.14.0); assigning the y variable to `hue` with
# legend=False gives the same colouring without the warning.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feature_importance_df,
    palette="viridis",
    legend=False,
)
plt.title('Feature Importances of Random Forest Model', fontsize=16)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
# Small tick labels so all feature names fit on the y axis
plt.tick_params(axis='y', labelsize=6)
plt.show()
<ipython-input-48-bf7221bfe7be>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette="viridis")
# Select the 10 features with the highest importance; sorting again here
# keeps the cell independent of the ordering left by earlier cells
top_10_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(10)

# Bar chart of the top 10. `hue` is set to the y variable with legend=False
# because passing `palette` without `hue` is deprecated in seaborn
# (FutureWarning, removal announced for v0.14.0).
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=top_10_features,
    palette="viridis",
    legend=False,
)
plt.title('Top 10 Feature Importances of Random Forest Model', fontsize=16)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.show()
<ipython-input-45-d9881851b4dc>:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Importance', y='Feature', data=top_10_features, palette="viridis")